Data Munging



In [1]:

    
import pandas as pd
def time_convert(x):
    """ Converti una stringa dal formato hh:mm:ss in nu"""
    try:
        times = x.split(':')
        return (3600*int(times[0])+60*int(times[1]))+int(times[2])
    except:
        return float('nan')
    
def ReadParseData(filename):
    # E` necessario convertire il tempo di gara in secondi, per poterlo confrontare nelle regressioni
    Cs = {'Official Time': time_convert, '5K': time_convert, 'M/F': lambda x: int(x == 'M')}    
    # EQUIVALENTE A:
    #Cs = dict() # oppure Cs = {}
    #Cs['Official Time'] = time_convert
    #Cs['M/F'] = lambda x: int(x == 'M')
    
    # Leggere la documentazione di "read_csv":
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    bm = pd.read_csv(filename, converters=Cs)
    
    # SCARTA LE COLONNE CHE NON SERVONO
    #bm.drop(bm.columns[[0,1,4,5,8,9]], axis=1, inplace=True)
    #bm.drop(bm.columns[[2,3,4,5,6,7,8,9,10,11,12,13]], axis=1, inplace=True)
    # OPPURE: Dedici quali serie tenere
    bm = bm[['Age','M/F','5K', 'Official Time','Overall','Gender','Division']]
    
    # Remove
    print('Numero dati PRIMA del preprocessing:', len(bm))
    bm = bm[bm['5K'] > 0]
    print('Numero dati DOPO il preprocessing:', len(bm))
    return bm
    
bm = ReadParseData('./data/marathon_results_2016.csv')
# STAMPA LE PRIME 3 RIGHE DEL DATA FRAME
#bm[:3]
bm[27:36]









    



Numero dati PRIMA del preprocessing: 26630
Numero dati DOPO il preprocessing: 26578






    Out[1]:






  
    
      
      Age
      M/F
      5K
      Official Time
      Overall
      Gender
      Division
    
  
  
    
      27
      24
      1
      1045.0
      8949
      28
      28
      26
    
    
      28
      29
      0
      1102.0
      8959
      29
      1
      1
    
    
      29
      24
      1
      1002.0
      8963
      30
      29
      27
    
    
      30
      34
      1
      1036.0
      8972
      31
      30
      28
    
    
      31
      28
      1
      1056.0
      8991
      32
      31
      29
    
    
      32
      31
      0
      1102.0
      9003
      33
      2
      2
    
    
      33
      24
      1
      1043.0
      9022
      34
      32
      30
    
    
      34
      27
      1
      1032.0
      9027
      35
      33
      31
    
    
      35
      27
      0
      1102.0
      9050
      36
      3
      3

Explanatory data analysis (statistica descrittiva)



In [2]:

    
import numpy as np
import matplotlib.pyplot as plt

def ScatterPlot(bm, Feature1, Feature2):
    sub = bm.copy()
    
    # Seleziona feature da plottare
    ym = sub[(sub['M/F'] == 1)][Feature1]
    xm = sub[(sub['M/F'] == 1)][Feature2]

    yf = sub[sub['M/F'] == 0][Feature1]
    xf = sub[sub['M/F'] == 0][Feature2]

    # Disegna il plot
    fig, ax = plt.subplots(figsize=(13, 7))

    ax.scatter(xm, ym, alpha=0.2, c='blue')
    ax.scatter(xf, yf, alpha=0.2, c='red')

    ax.legend(('Male', 'Female'))
    plt.show()
    
ScatterPlot(bm, 'Official Time', 'Gender')
ScatterPlot(bm, 'Official Time', 'Age')



In [3]:

    
import numpy as np
import matplotlib.pyplot as plt

def FilterPlot(F1, F2, threshold):
    # Filtra il dataframe
    sub = bm[bm.Gender < threshold]
    
    ym = sub[(sub['M/F'] == 1)][F1]
    xm = sub[(sub['M/F'] == 1)][F2]

    yf = sub[sub['M/F'] == 0][F1]
    xf = sub[sub['M/F'] == 0][F2]

    # Disegna il plot
    fig, ax = plt.subplots(figsize=(13, 7))

    ax.scatter(xm, ym, alpha=0.3, c='blue')
    ax.scatter(xf, yf, alpha=0.3, c='red')

    ax.legend(('Male', 'Female'))
    plt.show()
    
FilterPlot('Official Time', 'Age', 5000)
FilterPlot('M/F', 'Official Time', 5000)



In [4]:

    
import seaborn as sns

def PlotStrip(bm, threshold=1000):
    # Filtra il dataframe
    sub = bm[bm.Gender < threshold]    
    sns.stripplot(y='Official Time', x='M/F', data=sub, jitter=True)
    sns.plt.show()

PlotStrip(bm)

Supervised Learning: Classificazione tramite Regressione



In [5]:

    
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def GenerateTrainTestSet(bm, Fs, F2, threshold=200000):
    sub = bm[bm.Gender < threshold]
    x_train, x_test, y_train, y_test = train_test_split(sub[Fs], sub[F2], random_state=0)
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = GenerateTrainTestSet(bm, ['Official Time'], 'M/F')



In [6]:

    
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def PrintEvaluation(y_test, y_pred):
    try:
        print('MAE:', mean_absolute_error(y_test, y_pred))
        print('MSE:', mean_squared_error(y_test, y_pred))
        print('R2:', r2_score(y_test, y_pred))    
        print('ACCURACY:', accuracy_score(y_test, y_pred))
        print('REPORT:',classification_report(y_test, y_pred))
        print('CM:', confusion_matrix(y_test, y_pred))
    except:
        print('Errore nel calcolo delle statistiche: Debug il tuo codice')



In [7]:

    
def PlotPredictions(x_test, y_test, y_pred):
    # Plot valori di test
    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(x_test, y_test, alpha=0.3, c='blue')
    
    # Plot valori predetti
    fig, ax = plt.subplots(figsize=(13, 7))
    ax.scatter(x_test, y_pred, alpha=0.3, c='red')

    plt.show()

Regressione Lineare



In [8]:

    
from sklearn.linear_model import LinearRegression    

def RunLinearRegression(x_train, x_test, y_train):
    lr = LinearRegression(normalize=False)
    # Input to this function must be "DataFrames"
    lr.fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    y_pred = [1 if p > 0.5 else 0 for p in y_pred]
    return y_pred

y_pred = RunLinearRegression(x_train, x_test, y_train)
PlotPredictions(x_test, y_test, y_pred)
PrintEvaluation(y_test, y_pred)









    












    












    



MAE: 0.432204665162
MSE: 0.432204665162
R2: -0.736748046112
ACCURACY: 0.567795334838
REPORT:              precision    recall  f1-score   support

          0       0.56      0.35      0.43      3098
          1       0.57      0.76      0.65      3547

avg / total       0.57      0.57      0.55      6645

CM: [[1093 2005]
 [ 867 2680]]



In [9]:

    
import seaborn as sns
sns.jointplot(data=bm, x='Official Time', y='M/F', kind='reg', color='g')
sns.plt.show()









    



C:\Users\gualandi\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

Regressione Logistica



In [10]:

    
from sklearn.linear_model import LogisticRegression

def RunLogisticRegression(x_train, x_test, y_train):
    logit = LogisticRegression(penalty='l2', class_weight='balanced')
    # Input to this function must be "DataFrames"
    logit.fit(x_train, y_train)
    y_pred = logit.predict_proba(x_test)
    print(y_pred[:3])
    y_pred = [1 if p[0] < p[1] else 0 for p in y_pred]
    
    return y_pred
    
y_pred = RunLogisticRegression(x_train, x_test, y_train)
PrintEvaluation(y_test, y_pred)









    



[[ 0.41502752  0.58497248]
 [ 0.44816276  0.55183724]
 [ 0.4881548   0.5118452 ]]
MAE: 0.393227990971
MSE: 0.393227990971
R2: -0.580126268974
ACCURACY: 0.606772009029
REPORT:              precision    recall  f1-score   support

          0       0.58      0.57      0.57      3098
          1       0.63      0.64      0.64      3547

avg / total       0.61      0.61      0.61      6645

CM: [[1753 1345]
 [1268 2279]]



In [11]:

    
sns.jointplot(data=bm, x='Official Time', y='M/F', kind='reg', color='g', logistic=True)
sns.plt.show()









    



C:\Users\gualandi\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

Nearest Neighbours



In [12]:

    
from sklearn import neighbors

def RunNeighborClassifier(x_train, x_test, y_train):
    knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    # Input to this function must be "DataFrames"
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    return y_pred

y_pred = RunNeighborClassifier(x_train, x_test, y_train)
PrintEvaluation(y_test, y_pred)









    



MAE: 0.41625282167
MSE: 0.41625282167
R2: -0.67264801377
ACCURACY: 0.58374717833
REPORT:              precision    recall  f1-score   support

          0       0.55      0.55      0.55      3098
          1       0.61      0.61      0.61      3547

avg / total       0.58      0.58      0.58      6645

CM: [[1718 1380]
 [1386 2161]]

	Age	M/F	5K	Official Time	Overall	Gender	Division
27	24	1	1045.0	8949	28	28	26
28	29	0	1102.0	8959	29	1	1
29	24	1	1002.0	8963	30	29	27
30	34	1	1036.0	8972	31	30	28
31	28	1	1056.0	8991	32	31	29
32	31	0	1102.0	9003	33	2	2
33	24	1	1043.0	9022	34	32	30
34	27	1	1032.0	9027	35	33	31
35	27	0	1102.0	9050	36	3	3